{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 挑战 25：各国历年二氧化碳 CO2 排放量统计分析"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. 数据清洁"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 读取数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# 读取数据文件\n",
    "df_data = pd.read_excel(\"ClimateChange.xlsx\", sheet_name='Data')\n",
    "df_country = pd.read_excel(\"ClimateChange.xlsx\", sheet_name='Country')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 处理 data 数据表"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 选取 EN.ATM.CO2E.KT 数据，并将国家代码设置为索引\n",
    "\n",
    "df_data_reindex = df_data[df_data['Series code']== 'EN.ATM.CO2E.KT'].set_index('Country code')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 剔除不必要的数据列\n",
    "df_data_drop = df_data_reindex.drop(labels=['Country name', 'Series code', 'Series name', 'SCALE', 'Decimals'], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_data_nan = df_data_drop.replace({'..': pd.np.NaN})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# 对 NaN 空值进行向前和向后填充\n",
    "df_data_fill = df_data_nan.fillna(method='ffill', axis=1).fillna(method='bfill', axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# 对填充后依旧全部为空值的数据行进行剔除\n",
    "df_data_dropna = df_data_fill.dropna(how='all')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "scrolled": true
   },
   "source": [
    "### 处理 Country 数据表"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 将国家代码设置为索引\n",
    "df_country_reindex = pd.DataFrame(df_country).set_index('Country code')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 剔除不必要的数据列\n",
    "df_country_drop = df_country_reindex.drop(labels=['Capital city', 'Region', 'Lending category'], axis=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "scrolled": true
   },
   "source": [
    "### 合并数据表"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 对 Data 和 Country 表按照索引进行合并\n",
    "df_combine = pd.concat([df_data_dropna, df_country_drop], axis=1, sort=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 对合并后数据集进行求和得到各国排放总量\n",
    "df_combine['Sum emissions'] = df_combine[list(df_combine)[:-2]].sum(axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# 对合并后存在空值的数据行进行剔除，得到清洁后的数据集\n",
    "df_clean = df_combine.dropna(thresh=10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. 数据求和整理"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 按收入群体对数据进行求和"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 按收入群体对数据进行求和\n",
    "sum_by_groups = df_clean.groupby('Income group')['Sum emissions'].sum()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 按要求整理 DataFrame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 按要求整理 DataFrame\n",
    "item_high_list = []\n",
    "item_low_list = []\n",
    "\n",
    "for group_name in list(sum_by_groups.index):\n",
    "\n",
    "    # 得到各收入群体最高排放量数据\n",
    "    item_high = df_clean[df_clean['Income group'] == group_name].sort_values(\n",
    "        by='Sum emissions', ascending=False).iloc[0]\n",
    "\n",
    "    # 将最高排放量数据存入相应列表方便生成最终 DataFrame\n",
    "    item_high_list.append(\n",
    "        (item_high['Income group'], item_high['Country name'], item_high['Sum emissions']))\n",
    "\n",
    "    # 得到各收入群体最低排放量数据\n",
    "    item_low = df_clean[df_clean['Income group'] ==\n",
    "                        group_name].sort_values(by='Sum emissions').iloc[0]\n",
    "\n",
    "    # 将最低排放量数据存入相应列表方便生成最终 DataFrame\n",
    "    item_low_list.append(\n",
    "        (item_low['Income group'], item_low['Country name'], item_low['Sum emissions']))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 合并输出"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Sum emissions</th>\n",
       "      <th>Highest emission country</th>\n",
       "      <th>Highest emissions</th>\n",
       "      <th>Lowest emission country</th>\n",
       "      <th>Lowest emissions</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Income group</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>High income: OECD</th>\n",
       "      <td>2.588373e+08</td>\n",
       "      <td>United States</td>\n",
       "      <td>1.179918e+08</td>\n",
       "      <td>Iceland</td>\n",
       "      <td>46944.934</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>High income: nonOECD</th>\n",
       "      <td>1.581112e+07</td>\n",
       "      <td>Saudi Arabia</td>\n",
       "      <td>7.009760e+06</td>\n",
       "      <td>Turks and Caicos Islands</td>\n",
       "      <td>1503.470</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Low income</th>\n",
       "      <td>5.485641e+06</td>\n",
       "      <td>Korea, Dem. Rep.</td>\n",
       "      <td>3.104479e+06</td>\n",
       "      <td>Comoros</td>\n",
       "      <td>2068.188</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Lower middle income</th>\n",
       "      <td>6.272726e+07</td>\n",
       "      <td>India</td>\n",
       "      <td>2.681828e+07</td>\n",
       "      <td>Kiribati</td>\n",
       "      <td>601.388</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Upper middle income</th>\n",
       "      <td>2.100775e+08</td>\n",
       "      <td>China</td>\n",
       "      <td>9.809777e+07</td>\n",
       "      <td>Niue</td>\n",
       "      <td>80.674</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                      Sum emissions Highest emission country  \\\n",
       "Income group                                                   \n",
       "High income: OECD      2.588373e+08            United States   \n",
       "High income: nonOECD   1.581112e+07             Saudi Arabia   \n",
       "Low income             5.485641e+06         Korea, Dem. Rep.   \n",
       "Lower middle income    6.272726e+07                    India   \n",
       "Upper middle income    2.100775e+08                    China   \n",
       "\n",
       "                      Highest emissions   Lowest emission country  \\\n",
       "Income group                                                        \n",
       "High income: OECD          1.179918e+08                   Iceland   \n",
       "High income: nonOECD       7.009760e+06  Turks and Caicos Islands   \n",
       "Low income                 3.104479e+06                   Comoros   \n",
       "Lower middle income        2.681828e+07                  Kiribati   \n",
       "Upper middle income        9.809777e+07                      Niue   \n",
       "\n",
       "                      Lowest emissions  \n",
       "Income group                            \n",
       "High income: OECD            46944.934  \n",
       "High income: nonOECD          1503.470  \n",
       "Low income                    2068.188  \n",
       "Lower middle income            601.388  \n",
       "Upper middle income             80.674  "
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 设置 DataFrame 标签\n",
    "high_labels = ['Income group', 'Highest emission country', 'Highest emissions']\n",
    "low_labels = ['Income group', 'Lowest emission country', 'Lowest emissions']\n",
    "\n",
    "# 生成并合并目标 DataFrame\n",
    "highest_df = pd.DataFrame.from_records(item_high_list, columns=high_labels).set_index('Income group')\n",
    "lowest_df = pd.DataFrame.from_records(item_low_list, columns=low_labels).set_index('Income group')\n",
    "\n",
    "results = pd.concat([sum_by_groups, highest_df, lowest_df], axis=1)\n",
    "results"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 解法 2"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "从清洁后数据 `df_clean` 开始："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Highest emission country</th>\n",
       "      <th>Highest emissions</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Income group</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>High income: OECD</th>\n",
       "      <td>United States</td>\n",
       "      <td>1.179918e+08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Upper middle income</th>\n",
       "      <td>China</td>\n",
       "      <td>9.809777e+07</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Lower middle income</th>\n",
       "      <td>India</td>\n",
       "      <td>2.681828e+07</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>High income: nonOECD</th>\n",
       "      <td>Saudi Arabia</td>\n",
       "      <td>7.009760e+06</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Low income</th>\n",
       "      <td>Korea, Dem. Rep.</td>\n",
       "      <td>3.104479e+06</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                     Highest emission country  Highest emissions\n",
       "Income group                                                    \n",
       "High income: OECD               United States       1.179918e+08\n",
       "Upper middle income                     China       9.809777e+07\n",
       "Lower middle income                     India       2.681828e+07\n",
       "High income: nonOECD             Saudi Arabia       7.009760e+06\n",
       "Low income                   Korea, Dem. Rep.       3.104479e+06"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_max = df_clean.sort_values(by='Sum emissions', ascending=False).drop_duplicates(\n",
    "    'Income group').set_index('Income group')[['Country name', 'Sum emissions']]\n",
    "df_max.columns = ['Highest emission country','Highest emissions']\n",
    "df_max"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Lowest emission country</th>\n",
       "      <th>Lowest emissions</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Income group</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Upper middle income</th>\n",
       "      <td>Niue</td>\n",
       "      <td>80.674</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Lower middle income</th>\n",
       "      <td>Kiribati</td>\n",
       "      <td>601.388</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>High income: nonOECD</th>\n",
       "      <td>Turks and Caicos Islands</td>\n",
       "      <td>1503.470</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Low income</th>\n",
       "      <td>Comoros</td>\n",
       "      <td>2068.188</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>High income: OECD</th>\n",
       "      <td>Iceland</td>\n",
       "      <td>46944.934</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                       Lowest emission country  Lowest emissions\n",
       "Income group                                                    \n",
       "Upper middle income                       Niue            80.674\n",
       "Lower middle income                   Kiribati           601.388\n",
       "High income: nonOECD  Turks and Caicos Islands          1503.470\n",
       "Low income                             Comoros          2068.188\n",
       "High income: OECD                      Iceland         46944.934"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_min = df_clean.sort_values(by='Sum emissions', ascending=True).drop_duplicates(\n",
    "    'Income group').set_index('Income group')[['Country name', 'Sum emissions']]\n",
    "df_min.columns = ['Lowest emission country', 'Lowest emissions']\n",
    "df_min"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Sum emissions</th>\n",
       "      <th>Highest emission country</th>\n",
       "      <th>Highest emissions</th>\n",
       "      <th>Lowest emission country</th>\n",
       "      <th>Lowest emissions</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>High income: OECD</th>\n",
       "      <td>2.588373e+08</td>\n",
       "      <td>United States</td>\n",
       "      <td>1.179918e+08</td>\n",
       "      <td>Iceland</td>\n",
       "      <td>46944.934</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>High income: nonOECD</th>\n",
       "      <td>1.581112e+07</td>\n",
       "      <td>Saudi Arabia</td>\n",
       "      <td>7.009760e+06</td>\n",
       "      <td>Turks and Caicos Islands</td>\n",
       "      <td>1503.470</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Low income</th>\n",
       "      <td>5.485641e+06</td>\n",
       "      <td>Korea, Dem. Rep.</td>\n",
       "      <td>3.104479e+06</td>\n",
       "      <td>Comoros</td>\n",
       "      <td>2068.188</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Lower middle income</th>\n",
       "      <td>6.272726e+07</td>\n",
       "      <td>India</td>\n",
       "      <td>2.681828e+07</td>\n",
       "      <td>Kiribati</td>\n",
       "      <td>601.388</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Upper middle income</th>\n",
       "      <td>2.100775e+08</td>\n",
       "      <td>China</td>\n",
       "      <td>9.809777e+07</td>\n",
       "      <td>Niue</td>\n",
       "      <td>80.674</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                      Sum emissions Highest emission country  \\\n",
       "High income: OECD      2.588373e+08            United States   \n",
       "High income: nonOECD   1.581112e+07             Saudi Arabia   \n",
       "Low income             5.485641e+06         Korea, Dem. Rep.   \n",
       "Lower middle income    6.272726e+07                    India   \n",
       "Upper middle income    2.100775e+08                    China   \n",
       "\n",
       "                      Highest emissions   Lowest emission country  \\\n",
       "High income: OECD          1.179918e+08                   Iceland   \n",
       "High income: nonOECD       7.009760e+06  Turks and Caicos Islands   \n",
       "Low income                 3.104479e+06                   Comoros   \n",
       "Lower middle income        2.681828e+07                  Kiribati   \n",
       "Upper middle income        9.809777e+07                      Niue   \n",
       "\n",
       "                      Lowest emissions  \n",
       "High income: OECD            46944.934  \n",
       "High income: nonOECD          1503.470  \n",
       "Low income                    2068.188  \n",
       "Lower middle income            601.388  \n",
       "Upper middle income             80.674  "
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.concat([sum_by_groups, df_max, df_min], sort=False, axis=1)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
